// Source: www.gusucode.com > VC++ 编写的C语言的词法分析源码程序 > VC++ 编写的C语言的词法分析源码程序/code/lexer/lexer.cpp

/*
 * Lexical analyzer (tokenizer) for C source code.
 *
 */

/*
Example input program:
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int				a_value = 52357;
    int				b_value = 0x1234abcd;
    long long		c_value = 1234LL;
    unsigned int	d_value = 1234U;
    long			e_value	= 5464L;
    unsigned long	f_value = 6384762UL;

    double			g_value = 0.1234;

    char			j_value = 'a';
    char			k_value = '\n';

    char			*l_value = "a string \"";

    a_value += 1;
    b_value = 1 + 1 * 2 / 3 % 4;

    while (true)
    {
        if (1)
            break;
    }

    printf("%d", a_value);

    return 0;
}

 */
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

using namespace std;

/*
 * Token categories produced by the lexer.
 *
 * The numeric value of an enumerator is used as an index into
 * wordTypeList, so both must list the categories in the same order.
 */
enum WordType
{
    eKey = 0,       // keyword
    eIdent,         // identifier
    eFloat,         // floating-point literal
    eFloatScie,     // floating-point literal in scientific notation
    eIntDec,        // decimal integer literal
    eIntUnsigned,   // unsigned integer literal
    eIntHex,        // hexadecimal integer literal
    eLong,          // long integer literal
    eLongUnsigned,  // unsigned long integer literal
    eLongLong,      // long long integer literal
    eChar,          // character literal
    eString,        // string literal
    eMacro,         // preprocessor directive marker
    eOperator,      // operator
    eDelimiter,     // delimiter
    eInclude,       // the "include" directive
    eHeader,        // header file name
    eError          // malformed token
};

/*
 * One row of the category description table: a token category
 * together with the text used to describe it.
 */
struct WordTypeList
{
    WordType type;  // the token category
    string text;    // description text for that category
};

/*
 * 词类型的列表,用于输出词的说明文字
 */
struct WordTypeList wordTypeList[] = 
{
    {eKey,			"关键字"},
    {eIdent,		"标记符"},
    {eFloat,		"浮点型"},
    {eFloatScie,	"科学计数法的浮点数"},
    {eIntDec,		"整数"},
    {eIntUnsigned,	"无符号整数"},
    {eIntHex,		"16进制表示的整数"},
    {eLong,			"长整型"},
    {eLongUnsigned,	"无符号长整型"},
    {eLongLong,		"long long类型"},
    {eChar,			"字符"},
    {eString,		"字符串"},
    {eMacro,		"宏指示符"},
    {eOperator,		"操作符"},
    {eDelimiter,	"界符"},
    {eInclude,		"include语句"},
    {eHeader,		"头文件"},
    {eError,		"错误格式"}
};

/*
 * The C keywords recognised by the lexer.
 *
 * The final empty string is a sentinel marking the end of the list;
 * IsKeyWord stops scanning when it reaches it.
 */
string keyWordList[] =
{
    "auto",     "break",    "case",     "char",
    "const",    "continue", "default",  "do",
    "double",   "else",     "enum",     "extern",
    "float",    "for",      "goto",     "if",
    "int",      "long",     "register", "return",
    "short",    "signed",   "static",   "sizeof",
    "struct",   "switch",   "typedef",  "union",
    "unsigned", "void",     "volatile", "while",
    ""          // end-of-list sentinel
};

/*
 * Tells whether theWord is one of the entries of keyWordList.
 *
 * Returns true when theWord matches a keyword exactly,
 * false otherwise.
 */
bool IsKeyWord(const string &theWord)
{
    // Walk the table until the empty sentinel entry is reached.
    const string *entry = keyWordList;
    while (!entry->empty())
    {
        if (*entry == theWord)
        {
            return true;
        }
        ++entry;
    }

    return false;
}

/*
 * Finds the first non-whitespace character at or after pos.
 *
 * Whitespace is whatever isspace() reports (spaces, tabs, line
 * breaks, ...).
 *
 * str: the text to scan.
 * pos: index at which the scan starts.
 *
 * Returns the index of the first non-whitespace character, or -1 when
 * pos is negative, pos lies beyond the text, or only whitespace
 * remains.
 */
int GetFirstNonNullChar(const string &str, int pos)
{
    const int length = static_cast<int>(str.length());

    // A negative index can never address a character of the text.
    if (pos < 0)
    {
        return -1;
    }

    for (; pos < length; ++pos)
    {
        // isspace() is only defined for values representable as
        // unsigned char (or EOF); bytes >= 0x80 in a plain char would be
        // negative, so convert before classifying.
        if (!isspace(static_cast<unsigned char>(str[pos])))
        {
            return pos;
        }
    }

    return -1;
}

/*
 * Returns the character that follows position pos.
 *
 * The result is a one-character string holding str[pos + 1].  An
 * empty string is returned when pos is negative or when pos + 1 is
 * not a valid index, i.e. the end of the text has been reached.
 */
string GetNextChar(const string &str, int pos)
{
    // A negative position is treated as "nothing follows".
    if (pos >= 0)
    {
        const int next = pos + 1;
        if (next < static_cast<int>(str.length()))
        {
            return string(1, str[next]);
        }
    }

    return string();
}

/*
 * Finds the next boundary character at or after pos.
 *
 * A boundary character is any character that cannot be part of an
 * identifier or number, i.e. anything that is not a letter, a digit
 * or an underscore.
 *
 * str: the text to scan.
 * pos: index at which the scan starts (inclusive).
 *
 * Returns the index of the first boundary character, or -1 when pos
 * is negative or no boundary character exists before the end of the
 * text.
 */
int GetBoundary(const string &str, int pos)
{
    const int length = static_cast<int>(str.length());

    if (pos < 0)
    {
        return -1;
    }

    for (; pos < length; ++pos)
    {
        // Convert to unsigned char: the <cctype> classifiers are
        // undefined for negative values.
        const unsigned char ch = static_cast<unsigned char>(str[pos]);

        // A character is a boundary only when it is none of
        // letter / digit / underscore, so the tests are combined with &&.
        if (!isalnum(ch) && ch != '_')
        {
            return pos;
        }
    }

    return -1;
}

/*
 * Lexes an identifier.
 *
 * An identifier starts with a letter or an underscore and continues
 * with letters, digits and underscores.
 *
 * fileText:  the text being lexed.
 * wordBegin: index of the first character of the candidate identifier.
 * wordEnd:   on return, the index of the last character of the
 *            identifier; wordBegin - 1 when no identifier starts at
 *            wordBegin (nothing was consumed).
 *
 * Returns the identifier text, or an empty string when no identifier
 * starts at wordBegin.
 */
string LexIdentifier(const string &fileText, int wordBegin, int &wordEnd)
{
    const int length = static_cast<int>(fileText.length());

    // pos is always the index of the first character NOT yet consumed.
    int pos = wordBegin;

    if (pos >= 0 && pos < length)
    {
        // The <cctype> classifiers are undefined for negative values,
        // so every character is converted to unsigned char first.
        const unsigned char first = static_cast<unsigned char>(fileText[pos]);
        if (isalpha(first) || first == '_')
        {
            ++pos;
            while (pos < length)
            {
                const unsigned char ch = static_cast<unsigned char>(fileText[pos]);
                if (!isalnum(ch) && ch != '_')
                {
                    break;
                }
                ++pos;
            }
        }
    }

    // Computing the end from pos keeps it correct when the identifier
    // runs up to the very end of the text, so the caller always moves
    // past the whole identifier.
    wordEnd = pos - 1;

    if (pos <= wordBegin)
    {
        return string();
    }
    return fileText.substr(wordBegin, pos - wordBegin);
}

/*
 * Lexes the header name of an include directive.
 *
 * Both forms are supported: #include <stdio.h> and #include "stdio.h".
 *
 * fileText:  the text being lexed.
 * wordBegin: index of the opening delimiter, i.e. the '<' or '"' that
 *            precedes the header name.
 * wordEnd:   on return, the index of the last character of the header
 *            name, so that wordEnd + 1 is the closing delimiter (or
 *            the end of the text when the name is not terminated).
 *
 * Returns the header name without its delimiters.  The name ends at
 * the first '>' or '"' that follows the opening delimiter.
 */
string LexHeader(const string &fileText, int wordBegin, int &wordEnd)
{
    const int length = static_cast<int>(fileText.length());
    string name;

    // A negative start cannot address the text: consume nothing.
    if (wordBegin < 0)
    {
        wordEnd = wordBegin - 1;
        return name;
    }

    // pos is always the index of the first character NOT yet consumed.
    int pos = wordBegin + 1;
    while (pos < length && fileText[pos] != '>' && fileText[pos] != '"')
    {
        name.push_back(fileText[pos]);
        ++pos;
    }

    // Derived from pos so that it also holds when the text ends before
    // a closing delimiter is found.
    wordEnd = pos - 1;
    return name;
}

/*
 * 分析数值
 */
string LexNumeric(const string &fileText, int wordBegin, int &wordEnd, WordType &type)
{
    string theNum;
    char ch = fileText[wordBegin];	
    char upperCh;
    string nextCh;

    wordEnd = wordBegin;
    

    if (isdigit(ch))
    {
        theNum.push_back(ch);

        nextCh = GetNextChar(fileText, wordEnd);
        if (nextCh.empty())
        {
            goto out;
        }

        ++wordEnd;
        ch = nextCh[0];

        // 16进制的整数
        if (toupper(ch) == 'X')
        {
            do
            {
                theNum.push_back(ch);

                nextCh = GetNextChar(fileText, wordEnd);
                if (nextCh.empty())
                {
                    goto out;
                }

                ++wordEnd;
                ch = nextCh[0];
                upperCh = toupper(ch);

            } while ((upperCh >= '0' && upperCh <= '9')
                || (upperCh >= 'A' && upperCh <= 'F'));

            // 检查整个符号串
            int boundary = GetBoundary(fileText, wordEnd);
            // 注意:wordEnd这里实际已经指向数字串的最后一个字符的后一个字符
            if (wordEnd != boundary)
            {
                // 标记错误,并获取整个字符串
                type = eError;
                string part = fileText.substr(wordEnd, boundary - wordEnd);
                theNum.append(part);
            }

            type = eIntHex;
            --wordEnd;
            return theNum;
        }
        
        // 浮点数和整数
        int cDot = 0;
        while (isdigit(ch) || ch == '.')
        {
            if (ch == '.')
            {
                ++cDot;
                if (cDot > 1)
                {					
                    goto out;
                }
                type = eFloat;
            }
            theNum.push_back(ch);

            nextCh = GetNextChar(fileText, wordEnd);
            if (nextCh.empty())
            {
                goto out;
            }
            ch = nextCh[0];
            ++wordEnd;
        }
        
		if (cDot == 0)
		{
			type = eIntDec;
		}

        if (isalpha(ch))
        {
            if (cDot != 0)
            {
                goto out;
            }

            upperCh = toupper(ch);
            if (upperCh == 'L')
            {
                theNum.push_back(ch);
                
                nextCh = GetNextChar(fileText, wordEnd);
                if (nextCh.empty())
                {
                    goto out;
                }

                ch = nextCh[0];
                ++wordEnd;

                if (isalpha(ch))
                {	
                    upperCh = toupper(ch);
                    theNum.push_back(ch);
                    
                    if (upperCh == 'L')
                    {
                        type = eLongLong;						
                    } else
                    {
                        type = eError;
                        goto out;
                    }
                    
                } else
                {
                    --wordEnd;
                    type = eLong;
                }
            } else if (upperCh == 'U')
            {
                theNum.push_back(ch);
                
                nextCh = GetNextChar(fileText, wordEnd);
                if (nextCh.empty())
                {
                    goto out;
                }
                
                ch = nextCh[0];
                ++wordEnd;
                
                if (isalpha(ch))
                {	
                    upperCh = toupper(ch);
                    theNum.push_back(ch);
                    
                    if (upperCh == 'L')
                    {
                        type = eLongUnsigned;						
                    } else
                    {
                        type = eError;
                        goto out;
                    }
                    
                } else
                {
                    --wordEnd;
					type = eIntUnsigned;
                }
            } else
            {
                goto out;
            }
        } else
        {
            --wordEnd;
        }
         
        return theNum;
    }

// ERROR
out:
    type = eError;
    return theNum;
}

/*
 * Lexes a string literal.
 *
 * fileText:  the text being lexed.
 * wordBegin: index of the opening double quote.
 * wordEnd:   on return, the index of the last character of the
 *            string's content, so that wordEnd + 1 is the closing
 *            quote (or the end of the text when the literal is not
 *            terminated).  For an empty literal this equals wordBegin.
 *
 * Returns the content between the quotes exactly as written; escape
 * sequences are kept in their source spelling and are not decoded.
 */
string LexString(const string &fileText, int wordBegin, int &wordEnd)
{
    const int length = static_cast<int>(fileText.length());
    string content;

    // A negative start cannot address the text: consume nothing.
    if (wordBegin < 0)
    {
        wordEnd = wordBegin - 1;
        return content;
    }

    // pos is always the index of the first character NOT yet consumed.
    int pos = wordBegin + 1;
    while (pos < length && fileText[pos] != '"')
    {
        // A backslash escapes the character that follows it, so both
        // are consumed together.  This keeps \" inside the literal and
        // lets a literal that ends in \\ terminate at its closing quote.
        if (fileText[pos] == '\\' && pos + 1 < length)
        {
            content.push_back(fileText[pos]);
            ++pos;
        }
        content.push_back(fileText[pos]);
        ++pos;
    }

    wordEnd = pos - 1;
    return content;
}

/*
 * Lexes a character literal.
 *
 * fileText:  the text being lexed.
 * wordBegin: index of the opening single quote.
 * wordEnd:   on return, the index of the last character of the
 *            literal's content, so that wordEnd + 1 is the closing
 *            quote (or the end of the text when the literal is not
 *            terminated).
 *
 * Returns the content between the quotes exactly as written; escape
 * sequences such as \n or \\ are kept in their source spelling and
 * are not decoded.
 */
string LexChar(const string &fileText, int wordBegin, int &wordEnd)
{
    const int length = static_cast<int>(fileText.length());
    string content;

    // A negative start cannot address the text: consume nothing.
    if (wordBegin < 0)
    {
        wordEnd = wordBegin - 1;
        return content;
    }

    // pos is always the index of the first character NOT yet consumed.
    int pos = wordBegin + 1;
    while (pos < length && fileText[pos] != '\'')
    {
        // A backslash escapes the character that follows it, so both
        // are consumed together.  This keeps \' inside the literal and
        // lets '\\' terminate at its closing quote.
        if (fileText[pos] == '\\' && pos + 1 < length)
        {
            content.push_back(fileText[pos]);
            ++pos;
        }
        content.push_back(fileText[pos]);
        ++pos;
    }

    wordEnd = pos - 1;
    return content;
}

/*
 * Reads a whole source file into memory.
 *
 * filePath:    path of the file to read.
 * fileTextBuf: receives the complete content of the file on success;
 *              left untouched on failure.
 *
 * Returns 0 on success, -1 when the file cannot be opened or a read
 * error occurs.
 */
int ReadSourceFile(const string &filePath, string &fileTextBuf)
{
    ifstream ifs(filePath.c_str());
    if (!ifs.good())
    {
        return -1;
    }

    // Pull every character straight from the stream buffer.  This needs
    // no size computed up front, does not stop at any particular byte
    // value and keeps the very last character of the file.
    string contents((istreambuf_iterator<char>(ifs)), istreambuf_iterator<char>());
    if (ifs.bad())
    {
        return -1;
    }

    fileTextBuf.swap(contents);
    return 0;
}

/*
 * 将结果写filePath指向的文件
 */
void WriteParseResult(vector<pair<string, WordType>> &resultTable, const string &filePath)
{
	ofstream ofs(filePath.c_str());
    vector<pair<string, WordType>>::iterator iter;

	// 遍历存放结果的resultTable,并将结果写入文件
    for (iter = resultTable.begin(); iter < resultTable.end(); ++iter)
    {
        ofs << (*iter).first.c_str() << "\t\t\t";
        ofs << wordTypeList[(*iter).second].text.c_str() << endl;
    }

    ofs.close();
}

/*
 * 词法分析的主要实现部分
 */
void Parser(vector<pair<string, WordType>> &resultTable, const string fileText)
{
    int currentPos;
    int end;
    char ch;
    char chNext;
    string theWord;

    currentPos = 0;
    end = fileText.length();

    while (true)
    {
        currentPos = GetFirstNonNullChar(fileText, currentPos);
        
        if (currentPos >= end || currentPos < 0)
        {
            break;
        }
        ch = fileText[currentPos];
        if (isalpha(ch) || ch == '_')
        {
            
            theWord = LexIdentifier(fileText, currentPos, currentPos);

            if (IsKeyWord(theWord))
            {
                resultTable.push_back(pair<string, WordType>(theWord, eKey));
            } else
            {
                resultTable.push_back(pair<string, WordType>(theWord, eIdent));
            }
            theWord.clear();
        } else if (isdigit(ch))
        {
            WordType type;
            theWord = LexNumeric(fileText, currentPos, currentPos, type);
            resultTable.push_back(pair<string, WordType>(theWord, type));
            
            theWord.clear();
        } else if (ch == '+')
        {
            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("+=", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
            } else if (chNext == '+')
            {
                resultTable.push_back(pair<string, WordType>("++", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
            } else
            {
                resultTable.push_back(pair<string, WordType>("+", eOperator));
            }
            
        } else if (ch == '-')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }
            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("-=", eOperator));

                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
            } else if (chNext == '-')
            {
                resultTable.push_back(pair<string, WordType>("--", eOperator));

                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
            } else
            {
                resultTable.push_back(pair<string, WordType>("-", eOperator));
            }

        } else if (ch == '*')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }
            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("*=", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
            } else
            {
                resultTable.push_back(pair<string, WordType>("*", eOperator));
            }
        } else if (ch == '/')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }
            chNext = fileText[currentPos + 1];
            
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("/=", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }

            } else if (chNext == '/')
            {
                currentPos += 2;
                if (currentPos+2 >= end || currentPos+2 < 0)
                {
                    ++currentPos;
                    break;
                }

                while (fileText[currentPos] != '\n')
                {
                    ++currentPos;
                    if (currentPos >= end || currentPos < 0)
                    {
                        break;
                    }

                }
            } else if (chNext == '*')
            {
                currentPos += 2;
                if (currentPos+2 >= end || currentPos+2 < 0)
                {
                    break;
                }

                while (true)
                {
                    if (currentPos+1 >= end || currentPos+1 < 0)
                    {
                        break;
                    }

                    if (fileText[currentPos] == '*' && fileText[currentPos + 1] == '/')
                    {
                        break;
                    }

                    ++currentPos;
                    if (currentPos >= end || currentPos < 0)
                    {
                        break;
                    }

                }
            } else
            {
                resultTable.push_back(pair<string, WordType>("/", eOperator));
            }

        } else if (ch == '=')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }

            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("==", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }

            } else
            {
                resultTable.push_back(pair<string, WordType>("=", eOperator));
            }
        } else if (ch == '<')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }

            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>("<=", eOperator));
                
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }


            } else
            {
                resultTable.push_back(pair<string, WordType>("<", eOperator));
            }
        } else if (ch == '>')
        {
            if (currentPos+1 >= end || currentPos+1 < 0)
            {
                break;
            }

            chNext = fileText[currentPos + 1];
            if (chNext == '=')
            {
                resultTable.push_back(pair<string, WordType>(">=", eOperator));
                ++currentPos;
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }

            } else
            {
                resultTable.push_back(pair<string, WordType>(">", eOperator));
            }
        } else if (ch == '%')
        {
            resultTable.push_back(pair<string, WordType>(">", eOperator));
        } else if (ch == '(')
        {
            resultTable.push_back(pair<string, WordType>("(", eOperator));
        } else if (ch == ')')
        {
            resultTable.push_back(pair<string, WordType>(")", eOperator));
        } else if (ch == '{')
        {
            resultTable.push_back(pair<string, WordType>("{", eDelimiter));
        } else if (ch == '}')
        {
            resultTable.push_back(pair<string, WordType>("}", eDelimiter));
        } else if (ch == '[')
        {
            resultTable.push_back(pair<string, WordType>("[", eOperator));
        } else if (ch == ']')
        {
            resultTable.push_back(pair<string, WordType>("]", eOperator));
        } else if (ch == ',')
        {
            resultTable.push_back(pair<string, WordType>(",", eOperator));
        } else if (ch == ';')
        {
            resultTable.push_back(pair<string, WordType>(";", eDelimiter));
        } else if (ch == '"')
        {
            theWord = LexString(fileText, currentPos, currentPos);
            resultTable.push_back(pair<string, WordType>("\"", eDelimiter));
            resultTable.push_back(pair<string, WordType>(theWord, eString));
            ++currentPos;
            resultTable.push_back(pair<string, WordType>("\"", eDelimiter));
        } else if (ch == '\'')
        {
            theWord = LexChar(fileText, currentPos, currentPos);
            resultTable.push_back(pair<string, WordType>("'", eDelimiter));
            resultTable.push_back(pair<string, WordType>(theWord, eChar));
            ++currentPos;
            resultTable.push_back(pair<string, WordType>("'", eDelimiter));
        } else if (ch == '.')
        {
            resultTable.push_back(pair<string, WordType>(".", eDelimiter));
        } else if (ch == '#')
        {
            resultTable.push_back(pair<string, WordType>("#", eMacro));

            ++currentPos;

            currentPos = GetFirstNonNullChar(fileText, currentPos);
            if (currentPos >= end || currentPos < 0)
            {
                break;
            }
            
            theWord = LexIdentifier(fileText, currentPos, currentPos);

            if (theWord.compare("include") == 0)
            {
                resultTable.push_back(pair<string, WordType>(theWord, eInclude));

                ++currentPos;
                currentPos = GetFirstNonNullChar(fileText, currentPos);
                if (currentPos >= end || currentPos < 0)
                {
                    break;
                }
                ch = fileText[currentPos];

                if (ch == '<' || ch == '"')
                {
                    theWord = ch;
                    resultTable.push_back(pair<string, WordType>(theWord, eDelimiter));

                    theWord = LexHeader(fileText, currentPos, currentPos);
                    resultTable.push_back(pair<string, WordType>(theWord, eHeader));
                    ++currentPos;
                    ch = fileText[currentPos];
                    theWord = ch;
                    resultTable.push_back(pair<string, WordType>(theWord, eDelimiter));
                }
            } else
            {
                
            }

            theWord.clear();

        } else
        {
            
        }
        ++currentPos;
    }
}

/*
 * Program entry point.
 *
 * Expects exactly one command-line argument, the path of the source
 * file to lex.  The token list is written to a file whose name is
 * that path with ".output.txt" appended.  Exits with status 1 on a
 * usage error or when the source file cannot be read.
 */
int main(int argc, char *argv[])
{
    // Exactly one argument (the source file) is required.
    if (argc != 2)
    {
        cerr << "命令使用错误" << endl;
        cerr << "使用方法: lexer <源代码文件名>" << endl;
        exit(1);
    }

    string sourcePath(argv[1]);
    string sourceText;

    if (ReadSourceFile(sourcePath, sourceText) == -1)
    {
        cerr << "文件打开错误,请确保文件路径正确" << endl;
        exit(1);
    }

    vector<pair<string, WordType>> tokens;
    Parser(tokens, sourceText);

    // The report is placed next to the input file.
    const string reportPath = sourcePath + ".output.txt";
    WriteParseResult(tokens, reportPath);

    return 0;
}